import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from math import sqrt, log
from copy import deepcopy
from collections import defaultdict
from xgboost import XGBClassifier
# Import libraries
from sklearn import datasets
from sklearn.model_selection import (train_test_split,
StratifiedKFold
)
from sklearn.metrics import (accuracy_score,
f1_score,
precision_score,
recall_score,
classification_report,
confusion_matrix,
roc_curve,
auc,
matthews_corrcoef,
roc_auc_score
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_pipeline_imb, Pipeline
from sklearn.ensemble import AdaBoostClassifier,ExtraTreesClassifier
from sklearn import tree, manifold, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import validation_curve
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.decomposition import PCA
from hyperopt import hp
from hyperopt.pyll import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# Notebook display setup. `%matplotlib inline` is an IPython magic, so this
# file is a flattened Jupyter notebook and only runs under IPython/Jupyter.
sns.set_style(style="whitegrid")
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
# NOTE(review): `py` already *is* plotly.offline, so `py.offline` reaches the
# plotly.offline.offline submodule; the conventional call is simply
# py.init_notebook_mode(connected=True) -- confirm on the pinned plotly version.
py.offline.init_notebook_mode(connected=True)
colormap = plt.cm.RdBu  # shared diverging colormap for the correlation heatmaps below
# Load the assignment data and print basic shape/column information.
# ("\033[1m" / "\033[0m" are ANSI escape codes that render the labels bold.)
# read in the data as a pandas dataframe
loans = pd.read_csv("Data/Assignment2Data.csv")
loans.head()
print ("\033[1m" + "Rows : " + "\033[0m", loans.shape[0])
print ("\033[1m" + "Columns : " + "\033[0m", loans.shape[1])
print ("\033[1m" + "\nColumn Names : \n" + "\033[0m", loans.columns.tolist())
# see the data types
loans.info()
Hence, all features are numeric.
# Count how many columns contain at least one missing value.
missing_values = loans.isna().sum()
counter = 0
for val in missing_values:
    if val > 0:
        counter += 1  # BUG FIX: was `counter += 0`, which always reported 0
print(counter)
Observation: There are no missing values.
# Visualise the class balance of the target: pie chart (percentages) and
# bar chart (raw counts) side by side.
loans_viz = deepcopy(loans) # maintain a loans_viz dataframe purely for EDA and visualisations
loans_viz["bad_loans"].value_counts()
# Pie Chart showing percentages
plt.figure(figsize=(14,6))
plt.subplot(121)
loans_viz["bad_loans"].value_counts().plot.pie(autopct = "%.2f%%", fontsize=12)
plt.title("Bad Loans Pie Chart")
plt.ylabel("")
# Bar Chart Raw Number
plt.subplot(122)
ax = sns.countplot(x="bad_loans", data=loans_viz)
# CONSISTENCY FIX: annotate from loans_viz (the dataframe actually plotted);
# the original read `loans` here -- identical values at this point, but a
# latent mismatch once loans_viz starts being mutated below.
for i, j in enumerate(loans_viz["bad_loans"].value_counts().values):
    ax.annotate(str(j), (i, j), ha="center", va="bottom", fontsize=12)
plt.title("Bad Loans Bar Chart")
plt.show()
# Box + strip plot of every feature split by bad_loans, laid out on a
# 24x3 grid (the grid is oversized; only len(cols) cells get filled).
cols = list(loans_viz.columns)
cols.remove('bad_loans')
fig = plt.figure(constrained_layout=False, figsize=(20,100))
spec = fig.add_gridspec(24, 3, wspace=0.3, hspace=0.3)
for i in range(len(cols)):
    axes = fig.add_subplot(spec[i])
    sns.boxplot(x=loans_viz["bad_loans"],y=loans_viz[cols[i]],ax=axes,color='lightgray')
    sns.stripplot(x=loans_viz["bad_loans"],y=loans_viz[cols[i]],ax=axes)
    fig.add_subplot(axes)  # NOTE(review): redundant -- the axes were already added above
    axes.title.set_text(cols[i])
fig.show()
# Per-feature distribution overlays for good (bad_loans == 0, green) vs
# bad (bad_loans == 1, red) loans, on the same oversized 24x3 grid.
fig = plt.figure(constrained_layout=False, figsize=(20,100))
spec = fig.add_gridspec(24, 3, wspace=0.3, hspace=0.3)
for i in range(len(cols)):
    axes = fig.add_subplot(spec[i])
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11
    # (histplot/kdeplot are the replacements) -- confirm pinned version.
    sns.distplot(loans_viz[loans_viz['bad_loans'] == 0][cols[i]],
                 color='g', label="Good Loans")
    sns.distplot(loans_viz[loans_viz['bad_loans'] == 1][cols[i]],
                 color='r', label="Bad Loans")
    fig.add_subplot(axes)
    axes.title.set_text('Bad x Normal Loans by {}'.format(cols[i]))
    plt.legend(prop={'size': 10})
fig.show()
Observation: There are some features that distinctly separate the classes (bad and normal loans). We believe these features will be more statistically significant.
Summary:
# Distribution of annual income: overall, then split by loan outcome.
fig = plt.figure(constrained_layout=False, figsize=(20,6))
sns.distplot(loans_viz.annual_inc)
plt.title('Distribution of Annual Income')
fig.show()
fig = plt.figure(constrained_layout=False, figsize=(20,6))
sns.distplot(loans_viz[loans_viz['bad_loans'] == 0]["annual_inc"],
             color='g', label="Good Loans")
sns.distplot(loans_viz[loans_viz['bad_loans'] == 1]["annual_inc"],
             color='r', label="Bad Loans")
plt.title('Bad x Normal Loans by Annual Income')
fig.show()
def categorise_income(income):
    """Bucket an annual income into "High" (>200k), "Medium" (>100k) or "Low".

    Thresholds are exclusive: exactly 200000 is "Medium", exactly 100000 is "Low".
    """
    brackets = ((200000, "High"), (100000, "Medium"))
    for floor, label in brackets:
        if income > floor:
            return label
    return "Low"
# Bucket incomes, then compare loan amount / default flag / interest rate
# across the three income buckets with violin plots.
loans_viz["income_category"] = loans_viz.annual_inc.apply(categorise_income)
loans_viz.income_category.value_counts()
fig, ((ax1, ax2), (ax3, ax4))= plt.subplots(nrows=2, ncols=2, figsize=(14,6))
# Change the Palette types tomorrow!
sns.violinplot(x="income_category", y="loan_amnt", data=loans_viz, palette="Set2", ax=ax1 )
sns.violinplot(x="income_category", y="bad_loans", data=loans_viz, palette="Set2", ax=ax2)
# NOTE(review): ax4 is created but never used, so the fourth panel stays empty
sns.violinplot(x="income_category", y="int_rate", data=loans_viz, palette="Set2", ax=ax3)
Observation: The higher the income, the higher the loan amount. The higher the income, the lower the interest rate.
# Grouped plotly bar chart: counts of good vs bad loans per income bucket.
income_bad = loans_viz[loans_viz["bad_loans"] == 1]["income_category"].value_counts().reset_index()
income_bad.columns = ["income_category", "count"]
income_good = loans_viz[loans_viz["bad_loans"] == 0]["income_category"].value_counts().reset_index()
income_good.columns = ["income_category", "count"]
# bar - bad loans
trace1 = go.Bar(x = income_bad["income_category"] , y = income_bad["count"],
                name = "Customers with Bad Loans",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)
# bar - good loans
trace2 = go.Bar(x = income_good["income_category"] , y = income_good["count"],
                name = "Customers with Good Loans",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)
layout = go.Layout(dict(title = "Bad Loans by Income Category",
                        plot_bgcolor = "rgb(243,243,243)",paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = "Income Category",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = "count",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        )
                   )
# good loans listed first so they render as the left bar of each pair
data = [trace2,trace1]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
Observation: There are more customers with bad loans in the low income category as compared to the other categories. However, this may not be reflected when comparing the proportion of the bad loans in each income category.
# NOTE(review): this remaps bad_loans to "Yes"/"No" strings *in place* on
# loans_viz -- every later cell that reads loans_viz["bad_loans"] sees strings.
loans_viz["bad_loans"] = loans_viz["bad_loans"].map(lambda x : "Yes" if x == 1 else "No")
plt.figure(figsize=(18,12))
features = ['annual_inc','loan_amnt','int_rate','bad_loans']
# pairplot creates its own figure, so the plt.figure above is effectively unused
sns.pairplot(loans_viz[features],hue="bad_loans",height=3)
plt.show()
Observation: There is no clear pairwise correlation among loan_amnt, int_rate and annual_inc. Nonetheless, we can see a clear separation between the classes (bad and normal loans) when analysing the relation between loan_amnt and annual_inc, as well as between int_rate and annual_inc. This is due to the fact that there are more bad loans in the lower income group.
# Flag each loan as Low/High interest relative to the mean rate, then plot
# the loan outcome split by that flag.
loans_viz["int_rate"].describe()
# Average interest is 14.59%. Anything above this will be considered of high risk. Let's see if this is true.
# PERF FIX: compute the mean once; the original called np.mean(...) inside
# the map lambda, re-computing it for every single row.
mean_int_rate = np.mean(loans_viz.int_rate)
loans_viz["int_rate_type"] = loans_viz["int_rate"].map(lambda x: "Low" if x <= mean_int_rate else "High")
loans_viz.head()
plt.figure(figsize=(20,6))
ax = sns.countplot(x='int_rate_type', data=loans_viz,
                   palette="Set2", hue='bad_loans')
ax.set_title('The impact of interest rate on the condition of the loan', fontsize=16)
ax.set_xlabel('Level of Interest Payments', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.show()
Observation: Loans that have a high interest rate are more likely to default.
# We will use df_correlations dataframe to analyze our correlations.
# Plotly heatmap of pairwise correlations across all columns except the
# target and the derived income bucket.
# NOTE(review): the derived string column int_rate_type is still in the
# selection; older pandas silently dropped non-numeric columns in .corr(),
# newer versions raise unless numeric_only=True -- confirm pinned pandas.
df_correlations = loans_viz[[i for i in loans_viz.columns if i not in ["bad_loans", "income_category"]]].corr()
trace = go.Heatmap(z=df_correlations.values,
                   x=df_correlations.columns,
                   y=df_correlations.columns,
                   colorscale=[[0.0, 'rgb(165,0,38)'],
                               [0.1111111111111111, 'rgb(215,48,39)'],
                               [0.2222222222222222, 'rgb(244,109,67)'],
                               [0.3333333333333333, 'rgb(253,174,97)'],
                               [0.4444444444444444, 'rgb(254,224,144)'],
                               [0.5555555555555556, 'rgb(224,243,248)'],
                               [0.6666666666666666, 'rgb(171,217,233)'],
                               [0.7777777777777778, 'rgb(116,173,209)'],
                               [0.8888888888888888, 'rgb(69,117,180)'],
                               [1.0, 'rgb(49,54,149)']
                               ],
                   colorbar = dict(title = 'Level of Correlation',
                                   titleside = 'top',
                                   tickmode = 'array',
                                   tickvals = [-0.52,0.2,0.95],
                                   ticktext = ['Negative Correlation','Low Correlation','Positive Correlation'],
                                   ticks = 'outside'
                                   )
                   )
layout = {"title": "Correlation Heatmap"}
data=[trace]
fig = dict(data=data, layout=layout)
py.iplot(fig)
Observation: It is difficult to analyse the correlation of all 70 variables. Instead, we will look at the correlation heatmap of the more significant features later on in the notebook.
loans.head()
loans_viz
# Split the dataset: 80% for train, 20% for test
output = "bad_loans"
X = loans.drop(output, axis=1)
y = loans[output]
# stratify=y keeps the class ratio identical in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234, stratify=y)
features = X_train.columns
X_train_scaled = deepcopy(X_train)
X_test_scaled = deepcopy(X_test)
# Normalize features
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler = scaler.fit(X_train)
X_train_scaled[features] = scaler.transform(X_train)
X_test_scaled[features] = scaler.transform(X_test)
X_train_scaled.head()
X_train_scaled.columns
# Univariate feature scoring with the ANOVA F-test; k only affects what
# transform() would keep, the scores_/pvalues_ cover every column.
num_cols = list(X_train_scaled.columns)
#dataframe with non negative values
df_x = X_train_scaled
df_y = y_train
#fit model with k= all
select = SelectKBest(score_func = f_classif,k = 10)
fit = select.fit(df_x,df_y)
# CONSISTENCY FIX: label the scores with num_cols (the columns actually
# fitted, computed on the line above) instead of the distant `cols` list
# built during EDA. The contents are identical, but this removes the
# hidden coupling to a variable defined ~150 lines earlier.
score = pd.DataFrame({"features":num_cols,"scores":fit.scores_,"p_values":fit.pvalues_ })
score = score.sort_values(by = "scores" ,ascending =False)
#plot
trace = go.Scatter(x = score["features"],
y = score["scores"],
name = "Feature Scores",mode = "lines+markers",
marker = dict(color = "red",line = dict(width =1))
)
layout = go.Layout(dict(title = "Scores for Features",
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
tickfont = dict(size =10),
domain=[0, 1],
tickangle = 90,zerolinewidth=1,
ticklen=5,gridwidth=2),
yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
title = "scores",
zerolinewidth=1,ticklen=5,gridwidth=2),
margin = dict(b=200)
)
)
data=[trace]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
# Report features whose ANOVA p-value fails the 0.05 significance level,
# then keep the 10 highest-scoring features.
print("P-value of Insignificant Features")
# FIX: use label-based access. The original used positional `row[0]` /
# `row[2]` on a Series with a string index, which was deprecated and has
# been removed in recent pandas.
for index,row in score.iterrows():
    if row["p_values"] > 0.05:
        print(row["features"],":",row["p_values"])
score
len(score[score.p_values < 0.05])
top10_feats = list(score["features"][:10])
top10_feats
X_train_scaled.head()
# Reduce to the 10 best-scoring features, then draw a PCA scree plot of
# individual and cumulative explained variance.
X_train_reduced = X_train_scaled[top10_feats]
pca = PCA()
pca.fit(X_train_reduced)
pca_data = pca.transform(X_train_reduced)
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
labels=['PC' + str(testing) for testing in range(1, len(per_var) + 1)]
cum_var = np.cumsum(per_var)
plt.figure(figsize=(10, 5))
plt.bar(range(len(per_var)), per_var, alpha=0.3, align='center', label='individual explained variance', color = 'g',tick_label=labels)
plt.step(range(len(cum_var)), cum_var, where='mid',label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show();
labels
### pca.components_ is the set of all eigenvectors (aka loadings) for your projection space
# Heatmap of loadings: rows = principal components, columns = the 10 features.
features_components = pd.DataFrame(pca.components_, columns=X_train_reduced.columns, index = labels)
plt.figure(figsize=(12,10))
sns.heatmap(features_components, vmax=1, square=True,annot=True,cmap=colormap, linecolor='white')
plt.title('Correlation between features and components')
Observation: The correlation heatmap above allows us to see the breakdown of each Principal Component and potentially aid in its interpretability i.e. is there a particular set of features that are heavily weighted in the Principal Component
# Attach the training labels to the PCA scores and plot PC1 vs PC2.
pca_df = pd.DataFrame(pca_data, columns = labels)
pca_df.head()
y_train
# reset_index aligns y_train positionally with the PCA rows, since pca_data
# lost the original dataframe index
pca_df = pca_df.join(y_train.reset_index().drop("index", axis=1)[["bad_loans"]])
pca_df.head()
plt.figure(figsize=(20,6))
ax = sns.scatterplot(x="PC1", y="PC2", hue="bad_loans", data=pca_df)
plt.show()
Observation: Combining both PC1 and PC2 gives a total variation of around 70%. Plotting the 2 PCs, we can see there is clearly separation between the good loans and bad loans.
# 2-D t-SNE embedding of the top-10 features, coloured by loan outcome.
X_train_reduced
train_set_reduced = X_train_reduced.join(y_train)
time_start = time.time()
train_set_reduced_sne = deepcopy(train_set_reduced)
tsne = manifold.TSNE(n_components=2, verbose=1, perplexity=10, n_iter=5000,learning_rate=50,random_state=1234)
# BUG FIX: embed only the features. The original passed train_set_reduced,
# which still contains the bad_loans *label*, so the target leaked into the
# t-SNE input and exaggerated the apparent class separation.
tsne_results = tsne.fit_transform(X_train_reduced)
train_set_reduced_sne['tsne-2d-one'] = tsne_results[:,0]
train_set_reduced_sne['tsne-2d-two'] = tsne_results[:,1]
plt.figure(figsize=(16,10))
flatui = ["#00FFFF","#FF0000"]
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue='bad_loans',
    palette=sns.color_palette(flatui, 2),
    data=train_set_reduced_sne,
    legend="full",
    alpha=0.3
)
Observation: There is also a clear separation between the good and bad loans.
# Correlation among the top-10 features, then among the reduced top-5 subset
# (chosen by dropping highly correlated members of the top 10).
X_train_reduced.corr()
plt.figure(figsize=(12,10))
sns.heatmap(X_train_reduced.corr(), vmax=1, square=True,annot=True,cmap=colormap, linecolor='white')
plt.title('Correlation of Top 10 Features')
top5_feats = ['out_prncp','total_rec_prncp','int_rate','total_rec_late_fee','tot_hi_cred_lim']
plt.figure(figsize=(12,10))
# NOTE(review): this second heatmap is computed on the raw, full `loans`
# frame (train + test rows), not the scaled training split -- confirm intent.
sns.heatmap(loans[top5_feats].corr(), vmax=1, square=True,annot=True,cmap=colormap, linecolor='white')
plt.title('Correlation of Top 5 Features')
def cross_validation_smote_modified(splits, X_train, y_train, classifier):
    """Stratified k-fold CV with SMOTE applied *inside* each fold.

    SMOTE is fitted on the training portion of every fold only, so no
    synthetic samples leak into that fold's evaluation split.

    Parameters
    ----------
    splits : int
        Number of stratified folds.
    X_train : numpy.ndarray
        Feature matrix (positionally indexable -- callers pass `.values`).
    y_train : numpy.ndarray
        Binary labels (1 = bad loan).
    classifier : estimator
        Any sklearn-compatible classifier.

    Returns
    -------
    (average_result, full_result)
        Dicts keyed by metric name holding the mean score and the per-fold
        score list, respectively.
    """
    # FIX: random_state is only honoured when shuffle=True; recent sklearn
    # raises a ValueError for random_state combined with shuffle=False.
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1234)
    # Metric registry -- insertion order defines the result-dict key order.
    metric_funcs = {
        "accuracy": accuracy_score,
        "recall_score": recall_score,
        "precision": precision_score,
        "f1_score": f1_score,
        "roc_auc_score": roc_auc_score,
        "mcc": matthews_corrcoef,
    }
    full_result = {name: [] for name in metric_funcs}
    for train_index, test_index in skf.split(X_train, y_train):
        # train-test split in each fold
        X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        # SMOTE oversampling on the training set of the fold
        pipeline = make_pipeline_imb(SMOTE(random_state = 1234), classifier)
        model = pipeline.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_test_cv)
        for name, func in metric_funcs.items():
            full_result[name].append(func(y_test_cv, y_pred))
    average_result = {name: np.mean(scores) for name, scores in full_result.items()}
    return average_result, full_result
def cm_and_roc(confusion_matrix, class_names, Y_test, testPredict, figsize = (15,5), fontsize=12):
    """Render a labelled confusion-matrix heatmap and an ROC curve side by side.

    Parameters
    ----------
    confusion_matrix : 2-D array
        Counts, as returned by sklearn's confusion_matrix.
        NOTE(review): the parameter name shadows the imported
        sklearn.metrics.confusion_matrix function inside this scope.
    class_names : list
        Tick labels for both axes of the matrix.
    Y_test, testPredict :
        True labels and predictions/scores fed to roc_curve.
    figsize, fontsize :
        Figure size and tick-label font size.
    """
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names,
    )
    fig = plt.figure(figsize=figsize)
    # Confusion Matrix
    plt.subplot(121)
    heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title("Confusion Matrix")
    # ROC
    plt.subplot(122)
    # Plot ROC Curve
    fpr, tpr, thresholds = roc_curve(Y_test, testPredict) # fpr: FP Rate, tpr: TP Rate, thresholds: Pr(y=1)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--')  # chance diagonal
    plt.xlim([-0.1,1.1])
    plt.ylim([-0.1,1.1])
    plt.title('Receiver operating characteristic')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    return
#for categorical value
#top7_feats = top5_feats + ["income_category_High","income_category_Low"]
# Baseline model zoo: 16 classifiers, each evaluated below with the
# SMOTE-inside-CV helper. A shared random_state keeps runs reproducible.
random_state = 1234
classifiers = []
classifiers.append(DecisionTreeClassifier(random_state=random_state))
# NOTE(review): the first positional argument is AdaBoost's base estimator
# (renamed base_estimator -> estimator across sklearn versions -- confirm).
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state,learning_rate=0.1))
classifiers.append(BaggingClassifier(random_state=random_state))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state = random_state))
classifiers.append(BernoulliNB())
classifiers.append(GaussianNB())
classifiers.append(LinearSVC(random_state = random_state))
classifiers.append(LinearDiscriminantAnalysis())
classifiers.append(xgb.XGBClassifier(objective="binary:logistic", random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))
classifiers.append(SVC(kernel='rbf',probability=True,random_state=random_state))
classifiers.append(lgb.LGBMClassifier(random_state=random_state))
# Display names -- must stay index-aligned with `classifiers`.
classifier_names = ["DecisionTree","AdaBoost","BaggingClassifier","RandomForest","ExtraTrees","GradientBoosting"
                    ,"KNeighboursClassifier","LogisticRegression","BernoulliNB","GaussianNB","LinearSVC"
                    ,"LDA","XGB","MLPClassifier","SVC", "LGB"]
# Evaluate every classifier with 5-fold SMOTE-CV on the top-5 features and
# collect the mean and per-fold scores per model.
cv_average_results = dict()
cv_full_results = dict()
for i,classifier in enumerate(classifiers):
    print(f"Training Classifier {classifier_names[i]}...")
    try:
        average_result,full_result = cross_validation_smote_modified(5, X_train_scaled[top5_feats].values, y_train.values, classifier)
        acc = average_result["accuracy"]
        f1 = average_result["f1_score"]
        recall = average_result["recall_score"]
        precision = average_result["precision"]
        roc_auc = average_result['roc_auc_score']
        mcc = average_result['mcc']
        print(f"Mean Accuracy of {classifier_names[i]} is {acc}")
        print(f"Mean F1 of {classifier_names[i]} is {f1}")
        print(f"Mean Recall of {classifier_names[i]} is {recall}")
        print(f"Mean Precision of {classifier_names[i]} is {precision}")
        print(f"Mean ROC_AUC of {classifier_names[i]} is {roc_auc}")
        print(f"Mean MCC of {classifier_names[i]} is {mcc}")
        cv_average_results[classifier_names[i]] = average_result
        cv_full_results[classifier_names[i]] = full_result
    except Exception as e:
        # FIX: was a bare `except:` which also swallows KeyboardInterrupt /
        # SystemExit and hid the reason a model failed; report the error.
        print(f"{classifier_names[i]} Error: {e}")
        cv_average_results[classifier_names[i]] = 0
        cv_full_results[classifier_names[i]] = 0
cv_average_results
# Pivot the CV results into one list of scores per metric (rows = metrics,
# columns = classifiers) for the comparison bar charts below.
metrics = ["accuracy","f1_score","recall_score","precision","roc_auc_score","mcc"]
model_score_by_metric = []
for i in metrics:
    sub = []
    for j in classifier_names:
        res = cv_average_results[j]
        # ROBUSTNESS FIX: a classifier that errored upstream is stored as 0
        # (not a dict), which previously raised TypeError here; record NaN
        # instead so the comparison plots still render.
        sub.append(res[i] if isinstance(res, dict) else float("nan"))
    model_score_by_metric.append(sub)
# Horizontal bar chart per metric comparing all models, on a 3x2 grid.
fig = plt.figure(constrained_layout=False, figsize=(20,20))
spec = fig.add_gridspec(3, 2, wspace=0.3, hspace=0.3)
# Per-metric x-limits so bar differences stay visually distinguishable;
# anything unlisted (mcc) falls back to (0.5, 0.7), as in the original chain.
xlims = {
    "accuracy": (0.8, 0.9),
    "f1_score": (0.65, 0.75),
    "recall_score": (0.7, 1),
    "precision": (0.5, 0.65),
    "roc_auc_score": (0.8, 0.95),
}
for i,met in enumerate(metrics):
    cv_res = pd.DataFrame({"CrossValScore":model_score_by_metric[i],"Algorithm":classifier_names})
    axes = fig.add_subplot(spec[i])
    # FIX: pass x/y as keywords -- positional data arguments were deprecated
    # and then removed in seaborn 0.12+.
    g = sns.barplot(x="CrossValScore", y="Algorithm", data=cv_res, palette="Set3", orient="h")
    g.set_xlabel(met)
    g.set_xlim(*xlims.get(met, (0.5, 0.7)))
    g = g.set_title(f"Comparison of {met} for all Models")
# Address the Unbalanced Training Set by Oversampling SMOTE
# (this resampled set is used for the final refits; CV above already applied
# SMOTE per fold)
sm = SMOTE(random_state=1234)
# FIX: `fit_sample` was removed in imbalanced-learn 0.6; `fit_resample` is
# the supported name with identical semantics.
sm_X_train, sm_y_train = sm.fit_resample(X_train_scaled[top5_feats], y_train.ravel())
from sklearn.model_selection import validation_curve
from imblearn import pipeline as pl
def plot_validation_curve(classifier, classifier_name, parameter, param_range, X_train, y_train, scoring, base):
    """Plot train/CV scores of a SMOTE pipeline over one hyper-parameter.

    Parameters
    ----------
    classifier : estimator
        The sklearn classifier to sweep.
    classifier_name : str
        The pipeline step name, used to build the `step__param` name.
    parameter : str
        Hyper-parameter to sweep.
    param_range : sequence
        Candidate values.
    X_train, y_train : numpy arrays
        Training data.
    scoring : str
        sklearn scoring string (e.g. "f1").
    base : str
        "norm" plots the values as categorical strings; anything else plots
        them numerically on a log2 x-axis (used for e.g. the C parameter).

    Returns
    -------
    The parameter value with the best mean cross-validation score.
    """
    # Create SMOTE pipeline -- SMOTE is re-fitted inside every CV fold
    pipeline = pl.make_pipeline(SMOTE(random_state=1234), classifier)
    # FIX: shuffle=True so random_state is honoured; recent sklearn raises a
    # ValueError for random_state combined with the default shuffle=False.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
    # Create validation curve
    train_scores, test_scores = validation_curve(
        pipeline,
        X_train, y_train,
        param_name = '{0}__{1}'.format(classifier_name, parameter),
        param_range = param_range,
        cv=skf,
        scoring=scoring
    )
    # Create the graph
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fig = plt.figure(figsize=(8, 3), dpi=100)
    ax = plt.gca()
    plt.title("Validation Curve with {}".format(classifier_name))
    plt.xlabel(parameter)
    plt.ylabel(scoring)
    # Pad the y-limits by 1.1x the largest std so the shaded bands fit
    temp = list(train_scores_mean)
    temp.extend(test_scores_mean)
    temp2 = list(test_scores_std)
    temp2.extend(train_scores_std)
    max_y = max(temp) + (1.1*max(temp2))
    min_y = min(temp) - (1.1*max(temp2))
    plt.ylim(min_y, max_y)
    # The two original branches only differed in the x values (categorical
    # strings vs numeric log axis); deduplicated here.
    if base == "norm":
        xs = [str(i) for i in param_range]
    else:
        xs = [i for i in param_range]
    plt.plot(xs, train_scores_mean, label="Training score", color="r")
    plt.plot(xs, test_scores_mean, label="Cross-validation score", color="g")
    plt.fill_between(xs,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.2, color="r")
    plt.fill_between(xs,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.2, color="g")
    if base != "norm":
        # FIX: the `basex` keyword was removed in matplotlib 3.5; it is `base`.
        ax.set_xscale('log', base = 2)
    plt.axhline(y=1, color='k', ls='dashed')
    plt.legend(loc="best")
    plt.show()
    i = np.argmax(test_scores_mean)
    print("Best cross-validation result ({0:.2f}) obtained for {1}: {2}".format(test_scores_mean[i], parameter, param_range[i]))
    return param_range[i]
def bayes_opt(classifier,param_space,X_train,y_train,metric):
    """Run 100 rounds of TPE Bayesian optimisation over `param_space`.

    Each candidate parameter set is scored by 5-fold cross-validation of a
    SMOTE -> classifier pipeline. hyperopt minimises, so the CV score is
    negated. Results are printed; nothing is returned -- callers copy the
    printed best parameters by hand into the refit cells below.
    """
    def objective_function(param_space):
        # Build a fresh classifier for each trial with the sampled params
        clf = classifier(random_state=1234, **param_space)
        pipeline = Pipeline([
            ('sampling', SMOTE(random_state=1234)),
            ('classification', clf)
        ])
        score = cross_val_score(pipeline, X_train,y_train, cv=5,scoring=metric).mean()
        return {'loss': -score, 'status': STATUS_OK}
    trials = Trials()
    # NOTE(review): recent hyperopt versions expect `rstate` to be a numpy
    # Generator (np.random.default_rng) rather than RandomState -- confirm
    # against the pinned hyperopt version.
    best_param = fmin(objective_function,
                      param_space,
                      algo=tpe.suggest,
                      max_evals=100,
                      trials=trials,
                      rstate= np.random.RandomState(1234)
                      )
    loss = [x['result']['loss'] for x in trials.trials]
    print("")
    print("##### Results")
    print("Score best parameters: ", min(loss)*-1)  # undo the negation
    print("Best parameters: ", best_param)
# Hyper-parameter ranges for the RandomForest validation curves.
params_dic = {
    "max_depth": np.arange(5,16,1),
    "max_features": np.arange(3,6,1),
    "min_samples_split": np.arange(10, 51, 10),
    "min_samples_leaf": np.arange(10,31,10),
    "n_estimators": np.arange(50,301, 50),
    "bootstrap": np.array([False, True]),
    "criterion": np.array(["gini", "entropy"])
}
classifier = RandomForestClassifier(random_state=1234)
classifier_name = "randomforestclassifier"
rf_best_params = {}
# Sweep each hyper-parameter independently and remember its best value.
for param in params_dic:
    parameter = param
    param_range = params_dic[param]
    # NOTE(review): RandomForest has no "C" parameter, so the log branch is
    # never taken here; the pattern exists for the LogisticRegression sweep
    # further below.
    if param != "C":
        best_param = plot_validation_curve(classifier, classifier_name, parameter, param_range,
                                           X_train_scaled[top5_feats].values, y_train.values, "f1","norm")
    else:
        best_param = plot_validation_curve(classifier, classifier_name, parameter, param_range,
                                           X_train_scaled[top5_feats].values, y_train.values, "f1","log")
    rf_best_params[param] = best_param
# Hyperopt search space (round 1) for RandomForest, mirroring the
# validation-curve ranges; quniform values are cast to int via scope.int.
rf_param_space1 = {"max_depth": scope.int(hp.quniform('max_depth', 5, 15, 1)),
                   "max_features": scope.int(hp.quniform('max_features', 3, 5, 1)),
                   "min_samples_split": scope.int(hp.quniform("min_samples_split", 10, 50, 10)),
                   "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 10,30,10)),
                   "bootstrap": hp.choice("bootstrap", [False,True]),
                   "n_estimators" : scope.int(hp.quniform("n_estimators", 50,300,50)),
                   "criterion": hp.choice("criterion", ["gini","entropy"])}
bayes_opt(RandomForestClassifier,rf_param_space1,X_train_scaled[top5_feats].values, y_train.values,"f1")
# Refit RandomForest with the round-1 best parameters on the SMOTE-resampled
# training data and evaluate on the held-out test set.
# FIX: bootstrap was the *string* 'True'. Any non-empty string is truthy
# (so 'False' would also have enabled bootstrapping) and recent sklearn
# rejects non-boolean values outright.
rf = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=14, max_features=5,
                            min_samples_leaf=10, min_samples_split=40, n_estimators=100,
                            random_state=1234)
rf.fit(sm_X_train, sm_y_train)
y_pred_rf = rf.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_rf)
cm_and_roc(cm, [0, 1], y_test, y_pred_rf, figsize = (15,5), fontsize=12)
print ("Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("Sensitivity: ", recall_score(y_test, y_pred_rf))
print("Precision: ", precision_score(y_test, y_pred_rf))
print ("f1 Score: ", f1_score(y_test, y_pred_rf))
print("ROC_AUC_Score:", roc_auc_score(y_test, y_pred_rf))
print("MCC: ", matthews_corrcoef(y_test, y_pred_rf))
# Hyperopt search space (round 2): finer step (5) for min_samples_* and a
# tighter n_estimators grid.
rf_param_space2 = {"max_depth": scope.int(hp.quniform('max_depth', 5, 15, 1)),
                   "max_features": scope.int(hp.quniform('max_features', 3, 5, 1)),
                   "min_samples_split": scope.int(hp.quniform("min_samples_split", 10, 50, 5)),
                   "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 10,50,5)),
                   "bootstrap": hp.choice("bootstrap", [False,True]),
                   "n_estimators" : scope.int(hp.quniform("n_estimators", 50,200,25)),
                   "criterion": hp.choice("criterion", ["gini","entropy"])}
bayes_opt(RandomForestClassifier,rf_param_space2,X_train_scaled[top5_feats].values, y_train.values,"f1")
# Refit RandomForest with the round-2 best parameters and evaluate on the
# held-out test set.
# FIX: bootstrap=True as a boolean (was the string 'True'; see round 1).
rf = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=9, max_features=5,
                            min_samples_leaf=10, min_samples_split=40, n_estimators=150,
                            random_state=1234)
rf.fit(sm_X_train, sm_y_train)
y_pred_rf = rf.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_rf)
cm_and_roc(cm, [0, 1], y_test, y_pred_rf, figsize = (15,5), fontsize=12)
print ("Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("Sensitivity: ", recall_score(y_test, y_pred_rf))
print("Precision: ", precision_score(y_test, y_pred_rf))
print ("f1 Score: ", f1_score(y_test, y_pred_rf))
print("ROC_AUC_Score:", roc_auc_score(y_test, y_pred_rf))
print("MCC: ", matthews_corrcoef(y_test, y_pred_rf))
# Hyperopt search space (round 3): bootstrap/criterion pinned to the winning
# values of earlier rounds, with wider min_samples ranges.
rf_param_space3 = {"max_depth": scope.int(hp.quniform('max_depth', 5, 15, 1)),
                   "max_features": scope.int(hp.quniform('max_features', 3, 5, 1)),
                   "min_samples_split": scope.int(hp.quniform("min_samples_split", 5, 40, 5)),
                   "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 5,60,5)),
                   "bootstrap": hp.choice("bootstrap", [True]),
                   "n_estimators" : scope.int(hp.quniform("n_estimators", 100,250,10)),
                   "criterion": hp.choice("criterion", ["gini"])}
bayes_opt(RandomForestClassifier,rf_param_space3,X_train_scaled[top5_feats].values, y_train.values,"f1")
# Refit RandomForest with the round-3 best parameters and evaluate on the
# held-out test set.
# FIX: bootstrap=True as a boolean (was the string 'True'; see round 1).
rf = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=10, max_features=5,
                            min_samples_leaf=10, min_samples_split=40, n_estimators=210,
                            random_state=1234)
rf.fit(sm_X_train, sm_y_train)
y_pred_rf = rf.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_rf)
cm_and_roc(cm, [0, 1], y_test, y_pred_rf, figsize = (15,5), fontsize=12)
print ("Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("Sensitivity: ", recall_score(y_test, y_pred_rf))
print("Precision: ", precision_score(y_test, y_pred_rf))
print ("f1 Score: ", f1_score(y_test, y_pred_rf))
print("ROC_AUC_Score:", roc_auc_score(y_test, y_pred_rf))
print("MCC: ", matthews_corrcoef(y_test, y_pred_rf))
# Hyper-parameter ranges for the LogisticRegression validation curves;
# C is swept on a log scale (the "log" branch below).
params_dic = {
    "penalty": np.array(["l1", "l2"]),
    "fit_intercept": np.array([True, False]),
    "C": np.logspace(-5,5,500),
    "max_iter": np.array([100,200,500,800,1000,1500])
}
# NOTE(review): penalty "l1" requires a compatible solver (liblinear/saga)
# in recent sklearn; the default lbfgs supports l2 only -- confirm versions.
classifier = LogisticRegression(random_state=1234)
classifier_name = "logisticregression"
log_best_params = {}
for param in params_dic:
    parameter = param
    param_range = params_dic[param]
    if param != "C":
        best_param = plot_validation_curve(classifier, classifier_name, parameter, param_range,
                                           X_train_scaled[top5_feats].values, y_train.values, "f1","norm")
    else:
        best_param = plot_validation_curve(classifier, classifier_name, parameter, param_range,
                                           X_train_scaled[top5_feats].values, y_train.values, "f1","log")
    log_best_params[param] = best_param
log_best_params
# Hyperopt search space (round 1) for LogisticRegression; C is drawn from a
# 500-point log grid via hp.choice (fmin reports the chosen *index*).
log_param_space1 = {"penalty": hp.choice("penalty", ['l1','l2']),
                    "fit_intercept" : hp.choice("fit_intercept", [True,False]),
                    "C": hp.choice("C", list(np.logspace(-3,5,500))),
                    "max_iter":hp.choice("max_iter", [50,100,200,500]),
                    "n_jobs":hp.choice("n_jobs", [3])}
bayes_opt(LogisticRegression,log_param_space1,X_train_scaled[top5_feats].values, y_train.values,"f1")
# Refit LogisticRegression with the round-1 best parameters and evaluate on
# the held-out test set. [272] is the hp.choice *index* into the C grid
# that fmin reported as best.
lr = LogisticRegression(C=list(np.logspace(-3,5,500))[272],
                        fit_intercept=True,
                        max_iter=500,
                        n_jobs=3,
                        penalty='l2',
                        random_state=1234)
lr.fit(sm_X_train, sm_y_train)
y_pred_lr = lr.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_lr)
cm_and_roc(cm, [0, 1], y_test, y_pred_lr, figsize = (15,5), fontsize=12)
print ("Accuracy: ", accuracy_score(y_test, y_pred_lr))
print("Sensitivity: ", recall_score(y_test, y_pred_lr))
print("Precision: ", precision_score(y_test, y_pred_lr))
print ("f1 Score: ", f1_score(y_test, y_pred_lr))
print("ROC_AUC_Score:", roc_auc_score(y_test, y_pred_lr))
print("MCC: ", matthews_corrcoef(y_test, y_pred_lr))
# Hyperopt search space (round 2) for LogisticRegression: higher C floor and
# longer max_iter options; then refit with the round-2 best and evaluate.
log_param_space2 = {"penalty": hp.choice("penalty", ['l1','l2']),
                    "fit_intercept" : hp.choice("fit_intercept", [True,False]),
                    "C": hp.choice("C", list(np.logspace(-1,5,500))),
                    "max_iter":hp.choice("max_iter", [100,200,500,700,1000]),
                    "n_jobs":hp.choice("n_jobs", [3])}
bayes_opt(LogisticRegression,log_param_space2,X_train_scaled[top5_feats].values, y_train.values,"f1")
# [198] is the hp.choice index into the round-2 C grid reported by fmin
lr = LogisticRegression(C=list(np.logspace(-1,5,500))[198],
                        fit_intercept=True,
                        max_iter=100,
                        n_jobs=3,
                        penalty='l2',
                        random_state=1234)
lr.fit(sm_X_train, sm_y_train)
y_pred_lr = lr.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_lr)
cm_and_roc(cm, [0, 1], y_test, y_pred_lr, figsize = (15,5), fontsize=12)
print ("Accuracy: ", accuracy_score(y_test, y_pred_lr))
print("Sensitivity: ", recall_score(y_test, y_pred_lr))
print("Precision: ", precision_score(y_test, y_pred_lr))
print ("f1 Score: ", f1_score(y_test, y_pred_lr))
print("ROC_AUC_Score:", roc_auc_score(y_test, y_pred_lr))
print("MCC: ", matthews_corrcoef(y_test, y_pred_lr))
# Hyperopt search space (round 3) for LogisticRegression: narrower C range,
# more max_iter options; then refit with the round-3 best and evaluate.
log_param_space3 = {"penalty": hp.choice("penalty", ['l1','l2']),
                    "fit_intercept" : hp.choice("fit_intercept", [True,False]),
                    "C": hp.choice("C", list(np.logspace(-1,3,500))),
                    "max_iter":hp.choice("max_iter", [50,75,100,200,500,700,1000]),
                    "n_jobs":hp.choice("n_jobs", [3])}
bayes_opt(LogisticRegression,log_param_space3,X_train_scaled[top5_feats].values, y_train.values,"f1")
# [313] is the hp.choice index into the round-3 C grid reported by fmin
lr = LogisticRegression(C=list(np.logspace(-1,3,500))[313], fit_intercept=True,max_iter=700,n_jobs=3,penalty='l2',random_state=1234)
lr.fit(sm_X_train, sm_y_train)
y_pred_lr = lr.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_lr)
cm_and_roc(cm, [0, 1], y_test, y_pred_lr, figsize = (15,5), fontsize=12)
print ("Accuracy: ", accuracy_score(y_test, y_pred_lr))
print("Sensitivity: ", recall_score(y_test, y_pred_lr))
print("Precision: ", precision_score(y_test, y_pred_lr))
print ("f1 Score: ", f1_score(y_test, y_pred_lr))
print("ROC_AUC_Score:", roc_auc_score(y_test, y_pred_lr))
print("MCC: ", matthews_corrcoef(y_test, y_pred_lr))
# Hyper-parameter ranges for the XGBoost validation curves.
params_dic = {
    "learning_rate": np.array([0.01,0.05,0.1,0.2,0.3,0.5,1]),
    "gamma": np.arange(1,6,1),
    "subsample": np.arange(0.5, 1.01, 0.1),
    "colsample_bytree": np.arange(0.5, 1.01, 0.1),
    "reg_alpha": np.array([1e-5, 1e-2, 0.1, 1, 100]),
    "reg_lambda": np.array([1e-5, 1e-2, 0.1, 1, 100]),
    "max_depth": np.arange(5,11,1),
    "min_child_weight": np.arange(4,21,1),
    "n_estimators": np.array([10,20,50,100,200])
}
classifier = xgb.XGBClassifier(random_state=1234)
classifier_name = "xgbclassifier"
xgb_best_params = {}
for param in params_dic:
    parameter = param
    param_range = params_dic[param]
    # XGBoost has no "C" parameter, so the log branch is never taken here
    if param != "C":
        best_param = plot_validation_curve(classifier, classifier_name, parameter, param_range,
                                           X_train_scaled[top5_feats].values, y_train.values, "f1","norm")
    else:
        best_param = plot_validation_curve(classifier, classifier_name, parameter, param_range,
                                           X_train_scaled[top5_feats].values, y_train.values, "f1","log")
    xgb_best_params[param] = best_param
xgb_best_params
# XGBoost Bayesian search, round 1: ranges seeded from the validation curves.
xgb_param_space1 = {
    "learning_rate": hp.choice("learning_rate", [0.05, 0.07, 0.1, 0.15, 0.2]),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 10, 20, 1)),
    "gamma": scope.int(hp.quniform("gamma", 0, 4, 1)),
    "subsample": hp.uniform("subsample", 0.6, 1.0),
    "colsample_bytree": hp.uniform("colsample_by_tree", 0.4, 0.7),
    "reg_alpha": hp.choice("reg_alpha", [0.1, 1, 100]),
    "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-2, 0.1]),
    "max_depth": scope.int(hp.quniform("max_depth", 7, 11, 1)),
    "n_estimators": hp.choice("n_estimators", [50, 75, 100, 150, 200]),
}
bayes_opt(XGBClassifier, xgb_param_space1,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the best configuration on the SMOTE-resampled training set.
xgb_model = XGBClassifier(random_state=1234,
                          colsample_bytree=0.6970729260476218,
                          gamma=3,
                          learning_rate=0.15,
                          max_depth=7,
                          min_child_weight=18,
                          n_estimators=100,
                          reg_alpha=1,
                          reg_lambda=1e-5,
                          subsample=0.7863244971678923)
xgb_model.fit(sm_X_train, sm_y_train)
y_pred_xgb = xgb_model.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_xgb)
cm_and_roc(cm, [0, 1], y_test, y_pred_xgb, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_xgb))
# XGBoost Bayesian search, round 2: shifted ranges around the round-1 winner.
xgb_param_space2 = {
    "learning_rate": hp.choice("learning_rate", [0.1, 0.15, 0.2, 0.25, 0.3]),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 18, 25, 1)),
    "gamma": scope.int(hp.quniform("gamma", 3, 7, 1)),
    "subsample": hp.uniform("subsample", 0.7, 1.0),
    "colsample_bytree": hp.uniform("colsample_by_tree", 0.6, 0.7),
    "reg_alpha": hp.choice("reg_alpha", [0.1, 0.5, 1, 1.5]),
    "reg_lambda": hp.choice("reg_lambda", [1e-10, 1e-7, 1e-5, 1e-2]),
    "max_depth": scope.int(hp.quniform("max_depth", 3, 7, 1)),
    "n_estimators": hp.choice("n_estimators", [50, 75, 100, 150, 200]),
}
bayes_opt(XGBClassifier, xgb_param_space2,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the round-2 winner and evaluate on the held-out test set.
xgb_model = XGBClassifier(random_state=1234,
                          colsample_bytree=0.6653997176343666,
                          gamma=5,
                          learning_rate=0.3,
                          max_depth=5,
                          min_child_weight=19,
                          n_estimators=50,
                          reg_alpha=1.5,
                          reg_lambda=1e-2,
                          subsample=0.7680129070567101)
xgb_model.fit(sm_X_train, sm_y_train)
y_pred_xgb = xgb_model.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_xgb)
cm_and_roc(cm, [0, 1], y_test, y_pred_xgb, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_xgb))
# XGBoost Bayesian search, round 3: higher learning rates and stronger gamma.
xgb_param_space3 = {
    "learning_rate": hp.choice("learning_rate", [0.3, 0.35, 0.4, 0.45, 0.5]),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 19, 25, 1)),
    "gamma": scope.int(hp.quniform("gamma", 5, 10, 1)),
    "subsample": hp.uniform("subsample", 0.7, 1.0),
    "colsample_bytree": hp.uniform("colsample_by_tree", 0.6, 0.8),
    "reg_alpha": hp.choice("reg_alpha", [1.5, 2, 2.5, 3]),
    "reg_lambda": hp.choice("reg_lambda", [1e-2, 0.1, 1]),
    "max_depth": scope.int(hp.quniform("max_depth", 5, 10, 1)),
    "n_estimators": hp.choice("n_estimators", [50, 75, 100, 150, 200]),
}
bayes_opt(XGBClassifier, xgb_param_space3,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the round-3 winner and evaluate on the held-out test set.
xgb_model = XGBClassifier(random_state=1234,
                          colsample_bytree=0.7465937617317032,
                          gamma=7,
                          learning_rate=0.4,
                          max_depth=6,
                          min_child_weight=24,
                          n_estimators=200,
                          reg_alpha=1.5,
                          reg_lambda=1,
                          subsample=0.7069236585610036)
xgb_model.fit(sm_X_train, sm_y_train)
y_pred_xgb = xgb_model.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_xgb)
cm_and_roc(cm, [0, 1], y_test, y_pred_xgb, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_xgb))
# Validation-curve sweep for SVC. C is swept on a log axis, everything else
# on a linear axis.
params_dic = {
    "gamma": np.array([0.01, 0.05, 0.07, 0.1, 1]),
    "C": np.array([0.001, 0.01, 0.1, 1, 10]),
    "kernel": np.array(["linear", "poly", "rbf", "sigmoid"]),
}
classifier = SVC(random_state=1234)
classifier_name = "svc"
svm_best_params = {}
for param, param_range in params_dic.items():
    scale = "log" if param == "C" else "norm"
    svm_best_params[param] = plot_validation_curve(
        classifier, classifier_name, param, param_range,
        X_train_scaled[top5_feats].values, y_train.values, "f1", scale)
# SVC Bayesian search, round 1 (probability=True so predict_proba is available
# later for ROC curves).
svc_param_space1 = {
    "probability": hp.choice("probability", [True]),
    "gamma": hp.choice("gamma", [0.05, 0.07, 0.08, 0.1, 0.5, 1]),
    "C": hp.choice("C", [0.1, 1, 10, 50, 100]),
    "kernel": hp.choice("kernel", ["linear", "rbf"]),
}
bayes_opt(SVC, svc_param_space1,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner and evaluate on the held-out test set.
svm = SVC(C=100, gamma=0.08, kernel='rbf', probability=True, random_state=1234)
svm.fit(sm_X_train, sm_y_train)
y_pred_svm = svm.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_svm)
cm_and_roc(cm, [0, 1], y_test, y_pred_svm, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_svm))
# SVC Bayesian search, round 2: zoom in around C=100, gamma=0.08.
svc_param_space2 = {
    "probability": hp.choice("probability", [True]),
    "gamma": hp.choice("gamma", [0.07, 0.075, 0.08, 0.09]),
    "C": hp.choice("C", [100, 125, 150, 200]),
    "kernel": hp.choice("kernel", ["linear", "rbf"]),
}
bayes_opt(SVC, svc_param_space2,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner and evaluate on the held-out test set.
svm = SVC(C=125, gamma=0.075, kernel='rbf', probability=True, random_state=1234)
svm.fit(sm_X_train, sm_y_train)
y_pred_svm = svm.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_svm)
cm_and_roc(cm, [0, 1], y_test, y_pred_svm, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_svm))
# SVC Bayesian search, round 3: rbf kernel only, finer gamma / wider C grid.
svc_param_space3 = {
    "probability": hp.choice("probability", [True]),
    # NOTE(review): 0.85 breaks the 0.07-0.08 pattern of the other candidates —
    # possibly a typo for 0.085; confirm before relying on this grid.
    "gamma": hp.choice("gamma", [0.07, 0.072, 0.075, 0.078, 0.08, 0.85]),
    "C": hp.choice("C", [10, 30, 50, 120, 125, 150, 300]),
    "kernel": hp.choice("kernel", ["rbf"]),
}
bayes_opt(SVC, svc_param_space3,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner (same configuration as round 2) and evaluate.
svm = SVC(C=125, gamma=0.075, kernel='rbf', probability=True, random_state=1234)
svm.fit(sm_X_train, sm_y_train)
y_pred_svm = svm.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_svm)
cm_and_roc(cm, [0, 1], y_test, y_pred_svm, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_svm))
# Validation-curve sweep for the MLP classifier.
params_dic = {
    # FIX: np.array over tuples of different lengths is a "ragged" array and
    # raises a ValueError in modern NumPy; a plain list works with
    # sklearn's validation_curve and preserves the tuple values.
    "hidden_layer_sizes": [(50, 50, 50), (50, 100, 50), (100,)],
    "activation": np.array(['logistic', 'tanh', 'relu']),
    "solver": np.array(['lbfgs', 'sgd', 'adam']),
    "learning_rate": np.array(['constant', 'adaptive']),
    "alpha": np.array([0.0001, 0.001, 0.01, 0.05]),
    "batch_size": np.array([32, 64, 128, 256, 512]),
}
classifier = MLPClassifier(random_state=1234)
classifier_name = "mlpclassifier"
mlp_best_params = {}
for param, param_range in params_dic.items():
    # MLPClassifier has no "C" parameter, so the log-scale branch used for the
    # linear models never applied; every curve is plotted on a linear axis.
    mlp_best_params[param] = plot_validation_curve(
        classifier, classifier_name, param, param_range,
        X_train_scaled[top5_feats].values, y_train.values, "f1", "norm")
# Validation-curve sweep for the decision tree.
params_dic = {
    "max_depth": np.arange(5, 16, 1),
    "max_features": np.arange(3, 6, 1),
    "criterion": np.array(["gini", "entropy"]),
    "min_samples_split": np.arange(5, 51, 10),
    "min_samples_leaf": np.arange(5, 51, 5),
    "min_impurity_decrease": np.array([0, 1e-07, 1e-05, 0.001]),
}
classifier = DecisionTreeClassifier(random_state=1234)
classifier_name = "decisiontreeclassifier"
dt_best_params = {}
for param, param_range in params_dic.items():
    # DecisionTreeClassifier has no "C" parameter, so the log-scale branch
    # copied from the linear-model sweep was dead code; all curves use a
    # linear ("norm") axis.
    dt_best_params[param] = plot_validation_curve(
        classifier, classifier_name, param, param_range,
        X_train_scaled[top5_feats].values, y_train.values, "f1", "norm")
# Decision-tree Bayesian search, round 1.
DT_param_space1 = {
    "max_depth": scope.int(hp.quniform('max_depth', 3, 10, 1)),
    "max_features": scope.int(hp.quniform('max_features', 3, 5, 1)),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 5, 50, 10)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 30, 60, 5)),
    "min_impurity_decrease": hp.choice("min_impurity_decrease",
                                       [0, 1e-05, 0.001, 0.005, 0.01]),
}
bayes_opt(DecisionTreeClassifier, DT_param_space1,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner on the SMOTE-resampled training set and evaluate.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features=3,
                            min_impurity_decrease=0.001, min_samples_leaf=30,
                            min_samples_split=10, random_state=1234)
dt.fit(sm_X_train, sm_y_train)
y_pred_dt = dt.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_dt)
cm_and_roc(cm, [0, 1], y_test, y_pred_dt, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_dt))
# Decision-tree Bayesian search, round 2: deeper trees, smaller leaves allowed.
DT_param_space2 = {
    "max_depth": scope.int(hp.quniform('max_depth', 5, 15, 1)),
    "max_features": scope.int(hp.quniform('max_features', 3, 5, 1)),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 5, 50, 10)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 5, 50, 5)),
    "min_impurity_decrease": hp.choice("min_impurity_decrease",
                                       [0, 1e-05, 0.001, 0.005, 0.01]),
}
bayes_opt(DecisionTreeClassifier, DT_param_space2,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner on the SMOTE-resampled training set and evaluate.
dt = DecisionTreeClassifier(criterion='gini', max_depth=5, max_features=3,
                            min_impurity_decrease=1e-05, min_samples_leaf=40,
                            min_samples_split=30, random_state=1234)
dt.fit(sm_X_train, sm_y_train)
y_pred_dt = dt.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_dt)
cm_and_roc(cm, [0, 1], y_test, y_pred_dt, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_dt))
# Decision-tree Bayesian search, round 3: wider depth range, finer impurity grid.
DT_param_space3 = {
    "max_depth": scope.int(hp.quniform('max_depth', 3, 15, 1)),
    "max_features": scope.int(hp.quniform('max_features', 3, 5, 1)),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 5, 50, 10)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 5, 50, 5)),
    "min_impurity_decrease": hp.choice("min_impurity_decrease",
                                       [0, 1e-07, 1e-05, 0.001, 0.005]),
}
bayes_opt(DecisionTreeClassifier, DT_param_space3,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner on the SMOTE-resampled training set and evaluate.
dt = DecisionTreeClassifier(criterion='gini', max_depth=7, max_features=4,
                            min_impurity_decrease=1e-07, min_samples_leaf=25,
                            min_samples_split=20, random_state=1234)
dt.fit(sm_X_train, sm_y_train)
y_pred_dt = dt.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_dt)
cm_and_roc(cm, [0, 1], y_test, y_pred_dt, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_dt))
# Decision-tree Bayesian search, round 4: gini only, larger split sizes.
DT_param_space4 = {
    "max_depth": scope.int(hp.quniform('max_depth', 3, 15, 1)),
    "max_features": scope.int(hp.quniform('max_features', 3, 5, 1)),
    "criterion": hp.choice("criterion", ["gini"]),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 5, 80, 10)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 5, 50, 5)),
    "min_impurity_decrease": hp.choice("min_impurity_decrease",
                                       [0, 1e-07, 1e-06, 1e-05, 1e-03, 0.001]),
}
bayes_opt(DecisionTreeClassifier, DT_param_space4,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner on the SMOTE-resampled training set and evaluate.
dt = DecisionTreeClassifier(criterion='gini', max_depth=11, max_features=3,
                            min_impurity_decrease=1e-07, min_samples_leaf=20,
                            min_samples_split=80, random_state=1234)
dt.fit(sm_X_train, sm_y_train)
y_pred_dt = dt.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_dt)
cm_and_roc(cm, [0, 1], y_test, y_pred_dt, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_dt))
# Decision-tree Bayesian search, round 5: widest split range, finest impurity grid.
DT_param_space5 = {
    "max_depth": scope.int(hp.quniform('max_depth', 3, 15, 1)),
    "max_features": scope.int(hp.quniform('max_features', 3, 5, 1)),
    "criterion": hp.choice("criterion", ["gini"]),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 5, 100, 10)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 5, 50, 5)),
    "min_impurity_decrease": hp.choice("min_impurity_decrease",
                                       [0, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03]),
}
bayes_opt(DecisionTreeClassifier, DT_param_space5,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner on the SMOTE-resampled training set and evaluate.
dt = DecisionTreeClassifier(criterion='gini', max_depth=10, max_features=3,
                            min_impurity_decrease=1e-04, min_samples_leaf=15,
                            min_samples_split=40, random_state=1234)
dt.fit(sm_X_train, sm_y_train)
y_pred_dt = dt.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_dt)
cm_and_roc(cm, [0, 1], y_test, y_pred_dt, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_dt))
# Base tree for bagging: the best single-tree configuration found above
# (round-4 winner).
DT = DecisionTreeClassifier(criterion="gini",
                            max_depth=11,
                            max_features=3,
                            min_impurity_decrease=1e-07,
                            min_samples_leaf=20,
                            min_samples_split=80,
                            random_state=1234)

# Validation-curve sweep for the bagging meta-parameters.
params_dic = {
    # "max_features": np.arange(3, 6, 1),
    "bootstrap": np.array([False, True]),
    "n_estimators": np.array([10, 20, 30, 50, 100]),
}
classifier = BaggingClassifier(DT, random_state=1234)
classifier_name = "baggingclassifier"
bag_best_params = {}
for param, param_range in params_dic.items():
    # BaggingClassifier has no "C" parameter, so the log-scale branch copied
    # from the linear-model sweep was dead; plot every curve on a linear axis.
    bag_best_params[param] = plot_validation_curve(
        classifier, classifier_name, param, param_range,
        X_train_scaled[top5_feats].values, y_train.values, "f1", "norm")
# Bagging Bayesian search, round 1, built on the tuned tree DT.
bagging_param_space1 = {
    "bootstrap": hp.choice("bootstrap", [False, True]),
    "n_estimators": hp.choice("n_estimators", [10, 15, 20, 25, 30]),
    "n_jobs": hp.choice("n_jobs", [3]),
    "base_estimator": hp.choice("base_estimator", [DT]),
}
bayes_opt(BaggingClassifier, bagging_param_space1,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner on the SMOTE-resampled training set and evaluate.
bc = BaggingClassifier(DT, bootstrap=False, n_estimators=10, random_state=1234)
bc.fit(sm_X_train, sm_y_train)
y_pred_bc = bc.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_bc)
cm_and_roc(cm, [0, 1], y_test, y_pred_bc, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_bc))
# Bagging Bayesian search, round 2: wider n_estimators grid, including very
# small ensembles.
bagging_param_space2 = {
    "bootstrap": hp.choice("bootstrap", [False, True]),
    "n_estimators": hp.choice("n_estimators", [3, 4, 5, 10, 15, 20, 50, 100]),
    "n_jobs": hp.choice("n_jobs", [3]),
    "base_estimator": hp.choice("base_estimator", [DT]),
}
bayes_opt(BaggingClassifier, bagging_param_space2,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# Refit the winner on the SMOTE-resampled training set and evaluate.
bc = BaggingClassifier(DT, bootstrap=False, n_estimators=5, random_state=1234)
bc.fit(sm_X_train, sm_y_train)
y_pred_bc = bc.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_bc)
cm_and_roc(cm, [0, 1], y_test, y_pred_bc, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_bc))
# Validation-curve sweep for LightGBM.
# NOTE(review): "gamma" is an XGBoost name; LightGBM's equivalent is
# min_split_gain (already swept below) — confirm plot_validation_curve accepts
# it before relying on that curve.
params_dic = {
    "boosting_type": np.array(["gbdt", "dart", "goss"]),
    'num_leaves': np.arange(5, 50, 1),
    "learning_rate": np.array([0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1]),
    "gamma": np.arange(1, 5, 1),
    'subsample': np.arange(0.5, 1.0, 0.1),
    'colsample_bytree': np.arange(0.5, 1.0, 0.1),
    'reg_alpha': np.array([1e-5, 1e-2, 0.1, 1, 100]),
    'reg_lambda': np.array([1e-5, 1e-2, 0.1, 1, 100]),
    "max_depth": np.arange(5, 10, 1),
    'min_child_weight': np.arange(1, 6, 1),
    "n_estimators": np.array([10, 20, 50, 100, 200]),
    "min_split_gain": np.array([0.001, 0.01, 0.05, 0.1, 0.5, 1]),
    "min_child_samples": np.arange(5, 30, 5),
}
classifier = lgb.LGBMClassifier(random_state=1234)
classifier_name = "lgbmclassifier"
lgb_best_params = {}
for param, param_range in params_dic.items():
    # LGBMClassifier has no "C" parameter, so the log-scale branch copied
    # from the linear-model sweep was dead; all curves use a linear axis.
    lgb_best_params[param] = plot_validation_curve(
        classifier, classifier_name, param, param_range,
        X_train_scaled[top5_feats].values, y_train.values, "f1", "norm")
# LightGBM Bayesian search, round 1.
# FIX: the dict key "n_job" was a typo — LGBMClassifier silently drops unknown
# kwargs, so the intended n_jobs=3 was never applied. The search-space values
# are otherwise unchanged.
lgb_param_space1 = {
    "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"]),
    'num_leaves': scope.int(hp.quniform('num_leaves', 30, 50, 2)),
    "learning_rate": hp.choice("learning_rate", [0.01, 0.05, 0.1, 0.2, 0.3]),
    # NOTE(review): "gamma" is not a native LightGBM parameter (XGBoost name);
    # LightGBM's analogue is min_split_gain, swept separately below.
    "gamma": scope.int(hp.quniform('gamma', 1, 5, 1)),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.5, 1.0),
    'reg_alpha': hp.choice('reg_alpha', [1e-2, 0.1, 1, 100, 200, 500]),
    'reg_lambda': hp.choice('reg_lambda', [1e-2, 0.1, 1, 100, 200, 500]),
    "max_depth": scope.int(hp.quniform('max_depth', 5, 10, 1)),
    'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 7, 1)),
    "n_estimators": hp.choice("n_estimators", [5, 10, 20, 50, 100]),
    "min_split_gain": hp.choice('min_split_gain', [0.01, 0.05, 0.1, 0.5, 1]),
    "min_child_samples": scope.int(hp.quniform('min_child_samples', 5, 30, 5)),
    "n_jobs": hp.choice('n_job', [3]),
}
bayes_opt(lgb.LGBMClassifier, lgb_param_space1,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# FIX: the original refit used misspelled keywords (colsample_by_tree,
# n_estimator, n_job) which LGBMClassifier ignores — the tuned values were
# never applied and the model silently fell back to defaults.
lg = LGBMClassifier(boosting_type="gbdt",
                    colsample_bytree=0.854336569496965,
                    gamma=4,
                    learning_rate=0.05,
                    max_depth=6,
                    min_child_samples=25,
                    min_child_weight=6,
                    min_split_gain=0.5,
                    n_estimators=20,
                    num_leaves=42,
                    n_jobs=3,
                    reg_alpha=1,
                    reg_lambda=1e-2,
                    random_state=1234)
lg.fit(sm_X_train, sm_y_train)
y_pred_lg = lg.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_lg)
cm_and_roc(cm, [0, 1], y_test, y_pred_lg, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_lg))
# LightGBM Bayesian search, round 2: finer learning-rate and regularisation
# grids. FIX: "n_job" key renamed to "n_jobs" (unknown kwargs are silently
# dropped by LGBMClassifier).
lgb_param_space2 = {
    "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"]),
    'num_leaves': scope.int(hp.quniform('num_leaves', 30, 50, 2)),
    "learning_rate": hp.choice("learning_rate",
                               [0.01, 0.025, 0.05, 0.075, 0.1, 0.2]),
    # NOTE(review): "gamma" is not a native LightGBM parameter — see round 1.
    "gamma": scope.int(hp.quniform('gamma', 1, 5, 1)),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.5, 1.0),
    'reg_alpha': hp.choice('reg_alpha', [1e-05, 1e-03, 1e-2, 0.1, 1, 10]),
    'reg_lambda': hp.choice('reg_lambda', [1e-05, 1e-03, 1e-2, 0.1, 1, 10]),
    "max_depth": scope.int(hp.quniform('max_depth', 5, 10, 1)),
    'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 7, 1)),
    "n_estimators": hp.choice("n_estimators", [10, 20, 40, 50, 100]),
    "min_split_gain": hp.choice('min_split_gain',
                                [0.01, 0.025, 0.05, 0.075, 0.1, 0.5]),
    "min_child_samples": scope.int(hp.quniform('min_child_samples', 10, 30, 5)),
    "n_jobs": hp.choice('n_job', [3]),
}
bayes_opt(lgb.LGBMClassifier, lgb_param_space2,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# FIX: corrected the misspelled keywords (colsample_by_tree, n_estimator,
# n_job) so the tuned values are actually applied.
lg = LGBMClassifier(boosting_type="gbdt",
                    colsample_bytree=0.8056846435332191,
                    gamma=3,
                    learning_rate=0.075,
                    max_depth=6,
                    min_child_samples=30,
                    min_child_weight=7,
                    min_split_gain=0.05,
                    n_estimators=20,
                    num_leaves=32,
                    n_jobs=3,
                    reg_alpha=1,
                    reg_lambda=1e-05,
                    random_state=1234)
lg.fit(sm_X_train, sm_y_train)
y_pred_lg = lg.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_lg)
cm_and_roc(cm, [0, 1], y_test, y_pred_lg, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_lg))
# LightGBM Bayesian search, round 3: narrower n_estimators, reg_lambda down to 0.
# FIX: "n_job" key renamed to "n_jobs" (unknown kwargs are silently dropped).
lgb_param_space3 = {
    "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"]),
    'num_leaves': scope.int(hp.quniform('num_leaves', 30, 50, 2)),
    "learning_rate": hp.choice("learning_rate",
                               [0.01, 0.025, 0.05, 0.075, 0.1, 0.2]),
    # NOTE(review): "gamma" is not a native LightGBM parameter — see round 1.
    "gamma": scope.int(hp.quniform('gamma', 1, 5, 1)),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.5, 1.0),
    'reg_alpha': hp.choice('reg_alpha', [1e-2, 0.1, 1, 10]),
    'reg_lambda': hp.choice('reg_lambda', [0, 1e-05, 1e-03, 1e-2, 0.1, 1]),
    "max_depth": scope.int(hp.quniform('max_depth', 5, 10, 1)),
    'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 7, 1)),
    "n_estimators": hp.choice("n_estimators", [15, 20, 25, 30, 40]),
    "min_split_gain": hp.choice('min_split_gain',
                                [0.01, 0.025, 0.05, 0.075, 0.1, 0.5]),
    "min_child_samples": scope.int(hp.quniform('min_child_samples', 10, 30, 5)),
    "n_jobs": hp.choice('n_job', [3]),
}
bayes_opt(lgb.LGBMClassifier, lgb_param_space3,
          X_train_scaled[top5_feats].values, y_train.values, "f1")

# FIX: corrected the misspelled keywords (colsample_by_tree, n_estimator,
# n_job) so the tuned values are actually applied.
lg = LGBMClassifier(boosting_type="dart",
                    colsample_bytree=0.8591725014391239,
                    gamma=2,
                    learning_rate=0.075,
                    max_depth=6,
                    min_child_samples=30,
                    min_child_weight=2,
                    min_split_gain=0.5,
                    n_estimators=20,
                    num_leaves=30,
                    n_jobs=3,
                    reg_alpha=1e-2,
                    reg_lambda=1,
                    random_state=1234)
lg.fit(sm_X_train, sm_y_train)
y_pred_lg = lg.predict(X_test_scaled[top5_feats].values)
cm = confusion_matrix(y_test, y_pred_lg)
cm_and_roc(cm, [0, 1], y_test, y_pred_lg, figsize=(15, 5), fontsize=12)
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_lg))
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Thin wrapper that instantiates an sklearn-style classifier with a
    fixed random seed and exposes a uniform train/predict interface for the
    stacking step below.
    """

    def __init__(self, clf, seed=1234, params=None):
        # FIX: the original crashed with TypeError when params was left at its
        # None default (`params['random_state']` on None), and it also mutated
        # the caller's dict. Copy first, and tolerate params=None.
        params = dict(params) if params else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def predict_proba(self, x):
        return self.clf.predict_proba(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        # Fit and print the underlying model's feature importances.
        print(self.clf.fit(x, y).feature_importances_)
# The function below is a modified cross-validation in which only the training
# set of each fold undergoes SMOTE oversampling, addressing the unbalanced
# classes while ensuring no information leakage from the validation set into
# the training set of each fold.
# from imblearn.pipeline import make_pipeline as make_pipeline_imb, Pipeline
def get_oof(classifier, X_train, y_train, X_test, ntrain, ntest, n_splits=5):
    """Compute out-of-fold (OOF) predictions for stacking.

    In each stratified fold, only the fold's training portion is SMOTE
    oversampled (inside an imblearn pipeline), so no information leaks from
    the fold's validation split into training.

    Parameters
    ----------
    classifier : estimator placed after SMOTE in the pipeline.
    X_train, y_train : full training arrays, indexable by fold indices.
    X_test : held-out test array, predicted by every fold's model.
    ntrain, ntest : number of training / test rows.
    n_splits : number of CV folds (default 5, the original hard-coded value).

    Returns
    -------
    tuple of (ntrain, 1) OOF train predictions and (ntest, 1)
    fold-averaged test predictions.
    """
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((n_splits, ntest))
    # shuffle=True added: modern sklearn raises ValueError when random_state
    # is set while shuffle=False (the original relied on older, laxer behaviour
    # that ignored the seed).
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        # Train/validation split for this fold.
        X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        # SMOTE oversampling applied only to the fold's training split.
        pipeline = make_pipeline_imb(SMOTE(random_state=1234), classifier)
        model = pipeline.fit(X_train_cv, y_train_cv)
        oof_train[test_index] = model.predict(X_test_cv)
        oof_test_skf[i, :] = model.predict(X_test)
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
# Random Forest parameters - CV: 71.63% Test: 67.76%
rf_params = dict(
    bootstrap=True,
    criterion='gini',
    max_depth=10,
    max_features=5,
    min_samples_leaf=10,
    min_samples_split=40,
    n_estimators=210,
    random_state=1234,
)
# Extreme Gradient Boosting parameters - CV: 71.99% Test: 68.06%
# 'colsample_by_tree' corrected to XGBoost's real keyword 'colsample_bytree';
# the misspelled key was silently ignored by the estimator.
xgb_params = {
    'colsample_bytree': 0.7465937617317032,
    'gamma': 7.0,
    'learning_rate': 0.4,
    'max_depth': 6,
    'min_child_weight': 24,
    'n_estimators': 200,
    'reg_alpha': 1.5,
    'reg_lambda': 1,
    'subsample': 0.7069236585610036,
    'random_state': 1234
}
# DT parameters - CV: 72.06% Test: 68.29%
dt_params = dict(
    criterion='gini',
    max_depth=11,
    max_features=3,
    min_impurity_decrease=1e-07,
    min_samples_leaf=20,
    min_samples_split=80,
    random_state=1234,
)
# Bagging parameters - CV: 71.24% Test: 71.35%
# Base learner mirrors the tuned decision tree from dt_params.
# NOTE(review): sklearn >= 1.2 renamed BaggingClassifier's 'base_estimator'
# keyword to 'estimator' -- confirm against the installed sklearn version.
bag_params = dict(
    base_estimator=DecisionTreeClassifier(
        criterion="gini",
        max_depth=11,
        max_features=3,
        min_impurity_decrease=1e-07,
        min_samples_leaf=20,
        min_samples_split=80,
        random_state=1234,
    ),
    bootstrap=False,
    n_estimators=5,
    random_state=1234,
)
# lgb Parameters - CV: 72.1% Test: 69.27%
# 'colsample_by_tree' corrected to the real keyword 'colsample_bytree'; the
# misspelled key was silently ignored. 'gamma' is an XGBoost parameter, not
# LightGBM's (the LightGBM analogue is min_split_gain, set below); it is kept
# unchanged to preserve the forwarded kwargs -- confirm whether it was intended.
lgb_params = {
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.8309554549705769,
    'gamma': 5.0,
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_child_samples': 25,
    'min_child_weight': 6,
    'min_split_gain': 0.5,
    'n_estimators': 20,
    'num_leaves': 42,
    'reg_alpha': 1,
    'reg_lambda': 1e-02,
    'random_state': 1234,
    'n_jobs': 3
}
# Create the 5 objects that represent our 5 first-level models.
# NOTE(review): the original bound the helpers to names 'xgb' and 'lgb',
# shadowing the imported xgboost/lightgbm modules used elsewhere in this
# file; renamed to xgb_clf/lgb_clf so the modules stay reachable.
rf = SklearnHelper(clf=RandomForestClassifier, seed=1234, params=rf_params)
xgb_clf = SklearnHelper(clf=XGBClassifier, seed=1234, params=xgb_params)
dt = SklearnHelper(clf=DecisionTreeClassifier, seed=1234, params=dt_params)
bag = SklearnHelper(clf=BaggingClassifier, seed=1234, params=bag_params)
lgb_clf = SklearnHelper(clf=LGBMClassifier, seed=1234, params=lgb_params)
ntrain = len(X_train_scaled)
ntest = len(X_test_scaled)
# Create our OOF train and test predictions. These base results will be used
# as new (meta) features. The original trailing comments mislabelled several
# models ("AdaBoost", "Gradient Boost", "log reg") -- corrected below.
rf_oof_train, rf_oof_test = get_oof(rf, X_train_scaled.values, y_train.values, X_test_scaled.values, ntrain, ntest)        # Random Forest
xgb_oof_train, xgb_oof_test = get_oof(xgb_clf, X_train_scaled.values, y_train.values, X_test_scaled.values, ntrain, ntest)  # XGBoost
dt_oof_train, dt_oof_test = get_oof(dt, X_train_scaled.values, y_train.values, X_test_scaled.values, ntrain, ntest)        # Decision Tree
bag_oof_train, bag_oof_test = get_oof(bag, X_train_scaled.values, y_train.values, X_test_scaled.values, ntrain, ntest)     # Bagging
lgb_oof_train, lgb_oof_test = get_oof(lgb_clf, X_train_scaled.values, y_train.values, X_test_scaled.values, ntrain, ntest)  # LightGBM
print("Training is complete")
# Stack the OOF predictions column-wise: each column is one base model's
# prediction, forming the meta-features for the second-level model.
final_X_train = np.concatenate((rf_oof_train, xgb_oof_train, dt_oof_train, bag_oof_train, lgb_oof_train), axis=1)
final_X_test = np.concatenate((rf_oof_test, xgb_oof_test, dt_oof_test, bag_oof_test, lgb_oof_test), axis=1)
# SMOTE oversample the final_X_train
# Address the Unbalanced Training Set by Oversampling SMOTE
sm = SMOTE(random_state=1234)
# fit_sample() was removed in imbalanced-learn 0.8; fit_resample() is the
# current name for the same operation. np.ravel replaces Series.ravel(),
# which is deprecated in recent pandas, with identical output.
sm_final_X_train, sm_final_y_train = sm.fit_resample(final_X_train, np.ravel(y_train))
# Per-parameter sweep ranges for one-dimensional validation-curve tuning
# of the XGBoost meta-model.
params_dic = dict(
    learning_rate=np.array([0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1]),
    gamma=np.arange(1, 6),
    subsample=np.arange(0.5, 1.01, 0.1),
    colsample_bytree=np.arange(0.5, 1.01, 0.1),
    reg_alpha=np.array([1e-5, 1e-2, 0.1, 1, 100]),
    reg_lambda=np.array([1e-5, 1e-2, 0.1, 1, 100]),
    max_depth=np.arange(5, 11),
    min_child_weight=np.arange(4, 21),
    n_estimators=np.array([10, 20, 50, 100, 200]),
)
classifier = XGBClassifier(random_state=1234)
classifier_name = "xgbclassifier"
xgb_best_params = {}
# Sweep each hyperparameter independently with a validation curve and keep
# the best value found. The original's if/else on param != "C" chose a "log"
# scale only for a key "C" that params_dic does not contain, so the "log"
# branch was unreachable; the scale choice is kept as an expression so the
# behaviour is identical while the duplicated call is gone. The redundant
# 'parameter = param' alias is also removed.
for param, param_range in params_dic.items():
    scale = "log" if param == "C" else "norm"
    xgb_best_params[param] = plot_validation_curve(classifier, classifier_name, param, param_range,
                                                   final_X_train, y_train.values, "f1", scale)
xgb_best_params
# Hyperopt search space #1 for the XGBoost meta-model.
# Fix: the 'colsample_bytree' entry's hyperopt label was 'colsample_by_tree'.
# fmin reports best values keyed by LABEL, so the mismatch produced a best-
# params dict whose key did not match XGBClassifier's keyword; the label now
# matches the parameter name.
xgb_param_space1 = {"learning_rate": hp.choice("learning_rate", [0.05, 0.07, 0.1, 0.15, 0.2]),
                    "min_child_weight": scope.int(hp.quniform("min_child_weight", 10, 20, 1)),
                    "gamma": scope.int(hp.quniform("gamma", 0, 4, 1)),
                    "subsample": hp.uniform("subsample", 0.6, 1.0),
                    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1),
                    "reg_alpha": hp.choice("reg_alpha", [1e-5, 0.1, 1, 100]),
                    "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-2, 0.1]),
                    "max_depth": scope.int(hp.quniform("max_depth", 1, 10, 1)),
                    "n_estimators": hp.choice("n_estimators", [50, 75, 100, 150, 200])
                    }
bayes_opt(XGBClassifier, xgb_param_space1, final_X_train, y_train.values, "f1")
# Meta-model: XGBoost trained with the best hyperparameters from search
# space #1 on the SMOTE-resampled stacked features, then evaluated on the
# stacked test set.
xgb_model = XGBClassifier(
    random_state=1234,
    colsample_bytree=0.6209505599689564,
    gamma=1,
    learning_rate=0.2,
    max_depth=8,
    min_child_weight=16,
    n_estimators=100,
    reg_alpha=1,
    reg_lambda=1e-2,
    subsample=0.6741648759414965,
)
xgb_model.fit(sm_final_X_train, sm_final_y_train)
y_pred_xgb = xgb_model.predict(final_X_test)
cm = confusion_matrix(y_test, y_pred_xgb)
cm_and_roc(cm, [0, 1], y_test, y_pred_xgb, figsize=(15, 5), fontsize=12)
# Report the usual classification metrics (printed text matches the original).
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_xgb))
# Hyperopt search space #2 (deeper trees, smaller subsample) for the XGBoost
# meta-model.
# Fix: as in space #1, the 'colsample_bytree' entry's hyperopt label was
# 'colsample_by_tree'; fmin keys its best-params dict by label, so the label
# now matches the parameter name.
xgb_param_space2 = {"learning_rate": hp.choice("learning_rate", [0.2, 0.25, 0.3, 0.35, 0.4]),
                    "min_child_weight": scope.int(hp.quniform("min_child_weight", 16, 25, 1)),
                    "gamma": scope.int(hp.quniform("gamma", 0, 4, 1)),
                    "subsample": hp.uniform("subsample", 0.3, 0.7),
                    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1),
                    "reg_alpha": hp.choice("reg_alpha", [1e-5, 0.1, 0.5, 1, 1.5, 2]),
                    "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-2, 0.1]),
                    "max_depth": scope.int(hp.quniform("max_depth", 8, 20, 1)),
                    "n_estimators": hp.choice("n_estimators", [50, 75, 100, 150, 200])
                    }
bayes_opt(XGBClassifier, xgb_param_space2, final_X_train, y_train.values, "f1")
# Meta-model: XGBoost trained with the best hyperparameters from search
# space #2 on the SMOTE-resampled stacked features, then evaluated on the
# stacked test set.
xgb_model = XGBClassifier(
    random_state=1234,
    colsample_bytree=0.7891506999960809,
    gamma=1,
    learning_rate=0.2,
    max_depth=16,
    min_child_weight=19,
    n_estimators=150,
    reg_alpha=0.1,
    reg_lambda=1e-2,
    subsample=0.456421525369848,
)
xgb_model.fit(sm_final_X_train, sm_final_y_train)
y_pred_xgb = xgb_model.predict(final_X_test)
cm = confusion_matrix(y_test, y_pred_xgb)
cm_and_roc(cm, [0, 1], y_test, y_pred_xgb, figsize=(15, 5), fontsize=12)
# Report the usual classification metrics (printed text matches the original).
for label, metric in (("Accuracy: ", accuracy_score),
                      ("Sensitivity: ", recall_score),
                      ("Precision: ", precision_score),
                      ("f1 Score: ", f1_score),
                      ("ROC_AUC_Score:", roc_auc_score),
                      ("MCC: ", matthews_corrcoef)):
    print(label, metric(y_test, y_pred_xgb))